In [1]:
#pip install pandas-profiling
In [2]:
#pip install python-docx 
In [3]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from pandas_profiling import ProfileReport
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
import os
from scipy.stats import pearsonr
In [4]:
# Read the original (real) dataset; the CSV is looked up relative to the
# notebook's working directory.
original_data = pd.read_csv(
    'Original Dataset_30 Variables_Not to be Shared.csv',
)
# Optional pandas-profiling report for the original data (left disabled):
# original_report = ProfileReport(original_data, title='Original Data')
# original_report.to_file("original_report.html")
In [5]:
# Read the CTGAN-generated synthetic dataset; the CSV is looked up relative
# to the notebook's working directory.
synthetic_data = pd.read_csv(
    'CTGAN Generated data.csv',
)
# Optional pandas-profiling report for the synthetic data (left disabled):
# synthetic_data_report = ProfileReport(synthetic_data, title='Synthetic Data')
# synthetic_data_report.to_file("SyntheticData_report.html")
In [6]:
# comparison_report = original_report.compare(synthetic_data_report)
# comparison_report.to_file("original_vs_transformed.html")
In [7]:
## Distribution plots to compare original vs synthetic data visually
def plot_distplot_pairs(df1, df2, data1_name, data2_name, save_path):
    """Plot side-by-side distributions for every numeric column shared by two frames.

    For each column present in both ``df1`` and ``df2`` a two-panel figure
    (density histogram with a KDE overlay) is drawn, saved as
    ``<column>.png`` inside ``save_path``, added to
    ``<save_path>/distplots.zip`` and shown inline.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames to compare. Only columns common to both are considered, and
        columns that are not numeric in both frames are skipped.
    data1_name, data2_name : str
        Labels used in the panel titles for ``df1`` and ``df2``.
    save_path : str
        Output directory for the PNG files and the zip archive; created if
        it does not exist.

    Returns
    -------
    None
    """
    import re  # local import: only needed for filename sanitising below

    sns.set_theme(style='whitegrid')

    common_columns = df1.columns.intersection(df2.columns)

    os.makedirs(save_path, exist_ok=True)

    zip_path = os.path.join(save_path, 'distplots.zip')
    with zipfile.ZipFile(zip_path, 'w') as zipf:
        for column in common_columns:
            # A density plot is only meaningful for numeric data; skipping
            # avoids a crash on text/categorical columns shared by both frames.
            if not (
                pd.api.types.is_numeric_dtype(df1[column])
                and pd.api.types.is_numeric_dtype(df2[column])
            ):
                continue

            fig, axs = plt.subplots(1, 2, figsize=(10, 4))

            panels = zip(axs, (df1, df2), (data1_name, data2_name))
            for ax, frame, label in panels:
                # histplot(kde=True, stat='density') is the documented
                # replacement for the deprecated sns.distplot.
                sns.histplot(frame[column].dropna(), kde=True, stat='density', ax=ax)
                ax.set_title(f'Distribution of {column} ({label})')

            fig.tight_layout()

            # Replace characters that are path separators or otherwise invalid
            # in filenames so a column such as "a/b" cannot break the save.
            safe_name = re.sub(r'[\\/:*?"<>|]', '_', str(column))
            plot_filename = f'{safe_name}.png'
            plot_path = os.path.join(save_path, plot_filename)
            fig.savefig(plot_path)

            zipf.write(plot_path, arcname=plot_filename)

            plt.show()

            # Close this specific figure so a long column list does not
            # accumulate open figures in memory.
            plt.close(fig)

    print(f"Plots saved as '{save_path}/distplots.zip'")


# Labels for the two datasets and the directory that receives the figures.
data1_name, data2_name = "Original Data", "Synthetic Data"
save_path = "plots_directory"

# Draw, save and archive one comparison figure per shared column.
plot_distplot_pairs(
    df1=original_data,
    df2=synthetic_data,
    data1_name=data1_name,
    data2_name=data2_name,
    save_path=save_path,
)
Plots saved as 'plots_directory/distplots.zip'
In [8]:
# Dataframe Summary comparison
def calculate_summary_statistics(data):
    """Return per-column descriptive statistics, one row per numeric column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to summarise. Non-numeric columns are ignored.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with the columns ``count``, ``mean``,
        ``std``, ``min``, ``25%``, ``50%``, ``75%``, ``max`` and ``median``.
    """
    summary_stats = data.describe().transpose()
    # describe() keeps only the numeric columns of a mixed frame, so the median
    # must be restricted the same way; without numeric_only=True pandas >= 2.0
    # raises TypeError as soon as the frame contains a text column.
    summary_stats['median'] = data.median(numeric_only=True)
    return summary_stats[['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max', 'median']]

def compare_summary_statistics(original_data, synthetic_data):
    """Build a side-by-side table of summary statistics for two datasets.

    Each frame is summarised with ``calculate_summary_statistics`` and the
    two results are joined column-wise under the top-level column labels
    ``'Original Data'`` and ``'Synthetic Data'``.
    """
    labelled_frames = {
        'Original Data': original_data,
        'Synthetic Data': synthetic_data,
    }
    per_dataset_stats = [
        calculate_summary_statistics(frame) for frame in labelled_frames.values()
    ]
    return pd.concat(per_dataset_stats, axis=1, keys=list(labelled_frames))


# Summarise both datasets and display the combined table as the cell output.
summary_comparison_table = compare_summary_statistics(
    original_data=original_data,
    synthetic_data=synthetic_data,
)
summary_comparison_table
Out[8]:
Original Data Synthetic Data
count mean std min 25% 50% 75% max median count mean std min 25% 50% 75% max median
Feed N Plus 2A content 7882.0 47.976056 2.037363 40.38 47.0200 48.150 49.3400 59.89 48.150 5962.0 47.972068 2.046191 40.38 47.0000 48.150 49.3800 59.89 48.150
Reactor WAIT 7882.0 982.706760 14.361024 956.40 968.5400 980.920 993.3200 1009.66 980.920 5962.0 982.563648 14.284562 963.04 968.5300 980.890 991.9275 1009.49 980.890
H2 to HC 7882.0 3.821718 0.372182 2.76 3.5600 3.780 4.0300 7.79 3.780 5962.0 3.814648 0.335320 3.10 3.5600 3.770 4.0300 5.66 3.770
Reactor 1 Inlet Temp 7882.0 975.360089 13.693660 941.75 964.7900 971.180 984.7675 1005.35 971.180 5962.0 975.295392 13.532721 945.74 964.7700 970.845 984.4475 1004.35 970.845
Reactor 2 Inlet Temp 7882.0 982.323996 15.599825 948.76 966.4900 982.290 994.5375 1016.96 982.290 5962.0 982.172801 15.434145 951.40 966.4700 981.850 994.2700 1015.79 981.850
Reactor 3 Inlet Temp 7882.0 983.784026 14.004152 958.95 970.5525 982.060 996.0050 1013.95 982.060 5962.0 983.606872 13.933120 960.06 970.4725 981.610 995.4500 1009.32 981.610
Reactor 4 Inlet Temp 7882.0 986.966524 15.722209 962.90 971.6300 986.580 996.5875 1019.20 986.580 5962.0 986.789906 15.674754 964.35 971.5500 986.385 996.1625 1015.62 986.385
Reactor 1 Delta T 7882.0 160.881586 7.579552 129.35 156.0400 161.110 166.4575 178.08 161.110 5962.0 161.030943 7.303361 134.38 156.2325 161.160 166.4075 176.95 161.160
Reactor 2 Delta T 7882.0 106.128895 6.387439 75.37 101.5000 106.155 110.4900 132.41 106.155 5962.0 106.133452 6.186524 86.24 101.5600 106.120 110.4500 128.67 106.120
Reactor 3 Delta T 7882.0 71.011284 4.095590 48.10 69.4400 71.310 73.1800 85.91 71.310 5962.0 71.095792 3.766175 52.29 69.5300 71.350 73.1375 83.26 71.350
Reactor 4 Delta T 7882.0 41.694635 4.379323 21.24 39.5500 42.500 44.7200 58.45 42.500 5962.0 41.762420 4.140208 22.69 39.6700 42.580 44.6700 51.74 42.580
Reactor 1 Delta P 7882.0 0.990741 0.345861 0.16 0.6500 1.090 1.3100 1.99 1.090 5962.0 0.992466 0.341575 0.45 0.6500 1.100 1.3100 1.88 1.100
Reactor 2 Delta P 7882.0 2.588273 0.093082 2.28 2.5200 2.570 2.6700 2.83 2.570 5962.0 2.587736 0.091843 2.33 2.5200 2.570 2.6700 2.82 2.570
Reactor 3 Delta P 7882.0 2.642166 0.135004 2.03 2.5400 2.660 2.7500 2.96 2.660 5962.0 2.640976 0.132266 2.15 2.5300 2.650 2.7500 2.92 2.650
Reactor 4 Delta P 7882.0 2.804509 0.241722 2.08 2.6400 2.840 2.9800 3.30 2.840 5962.0 2.805659 0.240884 2.13 2.6400 2.840 2.9800 3.28 2.840
Seperator Pressure 7882.0 33.333715 1.339706 20.93 32.4900 33.460 34.2700 37.05 33.460 5962.0 33.360704 1.310147 28.65 32.5100 33.490 34.3000 36.50 33.490
Seperator Temperature 7882.0 105.665434 4.046135 89.24 103.2600 105.760 108.3100 120.44 105.760 5962.0 105.697828 3.924914 92.96 103.2900 105.805 108.3075 119.30 105.805
Recycle gas purity 7882.0 81.346800 9.148243 0.86 80.8800 82.550 83.4500 86.82 82.550 5962.0 81.232031 9.579163 0.86 80.8800 82.550 83.4500 86.82 82.550
Net gas Hydrogen Purity 7882.0 89.323787 1.219458 86.11 88.2500 89.580 90.4400 91.40 89.580 5962.0 89.338293 1.220673 86.11 88.2500 89.590 90.4400 91.40 89.590
Coke on Spent Catalyst 7882.0 3.181111 0.821481 1.82 2.5000 3.210 3.8400 4.65 3.210 5962.0 3.165453 0.819124 1.82 2.5000 3.150 3.8300 4.65 3.150
Chloride Injection rate 7882.0 2.374787 0.343244 0.00 2.0800 2.490 2.5800 10.02 2.490 5962.0 2.385143 0.296423 0.84 2.1025 2.510 2.5800 3.16 2.510
Total Paraffins in feed 7882.0 64.251845 60.531753 48.92 60.8300 62.380 63.8600 2093.11 62.380 5962.0 62.431676 2.201652 48.92 60.8300 62.370 63.8600 69.95 62.370
Total Naphthenes in feed 7882.0 26.427558 2.739174 18.94 24.0900 26.810 28.4800 41.15 26.810 5962.0 26.454210 2.749912 18.94 24.1400 26.860 28.4900 41.15 26.860
Total Aromatics in feed 7882.0 10.773975 1.065247 7.19 9.9700 10.790 11.5600 13.39 10.790 5962.0 10.758345 1.069349 7.19 9.9600 10.770 11.5000 13.39 10.770
Total olefins in Feed 7882.0 0.208891 0.226918 0.03 0.1000 0.120 0.1900 1.56 0.120 5962.0 0.206850 0.213717 0.05 0.1000 0.120 0.2000 1.48 0.120
Reactor LHSV 7882.0 1.564491 0.085537 1.09 1.5000 1.570 1.6200 1.74 1.570 5962.0 1.566602 0.080772 1.17 1.5000 1.570 1.6200 1.74 1.570
Feed IBP 7882.0 198.726871 6.565372 186.23 194.0600 196.850 200.3200 217.40 196.850 5962.0 198.658737 6.455776 186.23 193.9700 196.850 200.0900 217.40 196.850
50% IBP 7882.0 252.215292 2.962630 215.23 252.2700 252.560 252.9000 258.50 252.560 5962.0 252.245359 2.724750 215.23 252.2700 252.560 252.8900 258.50 252.560
WABT 7882.0 937.694057 15.687581 914.14 921.5100 936.035 947.5275 969.60 936.035 5962.0 937.485580 15.588433 915.36 921.4800 935.890 947.2200 967.72 935.890
Plant C5PlusYield 7882.0 84.979594 1.235512 79.48 84.0700 85.240 85.8400 88.79 85.240 5962.0 84.994794 1.209658 79.48 84.1300 85.260 85.8300 88.79 85.260
In [9]:
def _plot_correlation_heatmap(corr_matrix, title):
    """Draw an annotated heatmap of ``corr_matrix`` with the given ``title``."""
    fig, ax = plt.subplots(figsize=(20, 16))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5, ax=ax)
    ax.set_title(title)
    plt.show()
    plt.close(fig)


# Pairwise correlation matrices for both datasets. Numeric columns are
# selected explicitly because DataFrame.corr() raises on non-numeric columns
# in pandas >= 2.0 instead of silently dropping them.
original_corr = original_data.select_dtypes(include='number').corr()
synthetic_corr = synthetic_data.select_dtypes(include='number').corr()

# One heatmap per dataset, drawn by the same helper so both share styling.
_plot_correlation_heatmap(original_corr, 'Correlation Matrix - Original Data')
_plot_correlation_heatmap(synthetic_corr, 'Correlation Matrix - Synthetic Data')
In [10]:
# Compare, feature by feature, the Pearson correlation with the target column
# in the original data against the same correlation in the synthetic data.
target_column = "Plant C5PlusYield"
original_target = original_data[target_column]
synthetic_target = synthetic_data[target_column]

# Collect one record per feature and build the frame once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.
correlation_records = []
for column in original_data.columns:
    # The target's correlation with itself is not informative.
    if column == target_column:
        continue

    # pearsonr returns (coefficient, p-value); only the coefficient is kept.
    # Distinct scalar names are used so the correlation matrices computed in
    # an earlier cell are not overwritten.
    original_r = pearsonr(original_data[column], original_target)[0]
    synthetic_r = pearsonr(synthetic_data[column], synthetic_target)[0]

    correlation_records.append({
        "Column-Pair": column,
        "Original Correlation": original_r,
        "Synthetic Correlation": synthetic_r,
    })

# Passing `columns` keeps the expected schema even when there are no records.
correlation_df = pd.DataFrame(
    correlation_records,
    columns=["Column-Pair", "Original Correlation", "Synthetic Correlation"],
)

# Display the correlation table as the cell output
correlation_df
Out[10]:
Column-Pair Original Correlation Synthetic Correlation
0 Feed N Plus 2A content 0.382200 0.390723
1 Reactor WAIT -0.705501 -0.707660
2 H2 to HC 0.072479 0.090593
3 Reactor 1 Inlet Temp -0.708477 -0.718298
4 Reactor 2 Inlet Temp -0.673569 -0.673391
5 Reactor 3 Inlet Temp -0.698320 -0.697402
6 Reactor 4 Inlet Temp -0.658659 -0.664151
7 Reactor 1 Delta T -0.340005 -0.353268
8 Reactor 2 Delta T -0.401247 -0.407639
9 Reactor 3 Delta T 0.393166 0.408004
10 Reactor 4 Delta T 0.602311 0.616397
11 Reactor 1 Delta P -0.441016 -0.446243
12 Reactor 2 Delta P -0.529404 -0.539138
13 Reactor 3 Delta P -0.152238 -0.163390
14 Reactor 4 Delta P -0.243650 -0.243682
15 Seperator Pressure 0.458794 0.480809
16 Seperator Temperature 0.335676 0.335589
17 Recycle gas purity 0.078186 0.077599
18 Net gas Hydrogen Purity 0.465102 0.468186
19 Coke on Spent Catalyst -0.251571 -0.241133
20 Chloride Injection rate -0.009619 -0.034867
21 Total Paraffins in feed 0.002509 -0.333527
22 Total Naphthenes in feed 0.062026 0.075786
23 Total Aromatics in feed 0.287626 0.277741
24 Total olefins in Feed 0.392807 0.399858
25 Reactor LHSV 0.198023 0.193197
26 Feed IBP 0.360778 0.357431
27 50% IBP 0.329538 0.362343
28 WABT -0.714266 -0.716756
In [11]:
# Set the column pair as the index for the correlation table
correlation_df.set_index('Column-Pair', inplace=True)

# Plot the correlation values
correlation_df.plot(kind='bar', figsize=(18, 9))
plt.title('Correlation Comparison')
plt.xlabel('Column-Pair')
plt.ylabel('Pearson Correlation')
plt.xticks(rotation=45)
plt.legend(['Original Data', 'Synthetic Data'])
plt.show()